Time Series Characterisation
!pip install -r requirements.txt
import numpy as np
import pandas as pd
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
import statsmodels.robust
from statsmodels import api as sm
import statsmodels.tsa.stattools as tsa
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
import os
from math import sqrt
import matplotlib
import matplotlib.pyplot as plt
matplotlib.colors
matplotlib.rcParams['axes.labelsize'] = 14
matplotlib.rcParams['xtick.labelsize'] = 12
matplotlib.rcParams['ytick.labelsize'] = 12
matplotlib.rcParams['text.color'] = 'k'
import seaborn as sns
from random import random
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, median_absolute_error, mean_squared_log_error
import warnings; import os
warnings.filterwarnings("ignore")
plt.style.available
plt.style.use('bmh')
from pylab import rcParams
product = 'engine_parts' # Create a dictionary to store product code and sitc code
experiment = 'rca_tests'
def load_data(csv_name,trade_form,target_value, code_system,sitc_code,ts_type):
'''
Ingests SITC multi-time series trade data in csv format, filters for one product code, removes zero values for
target value, screens target values and renames target columns & creates dataframe for the target columns
Parameters:
-----------
csv:
trade_form:
target_value:
code_system:
sitc_code:
ts_type:
'''
data = pd.read_csv(csv_name)#,header=None
# TO DO: Create for loop to read sitc code from dictionary
dframe = data.loc[data[code_system]==sitc_code]
dframe.rename(columns={'location_name_short_en': 'exporter','export_rca': 'rca'}, inplace=True)
dframe = dframe[[ts_type,trade_form,target_value]]
dframe = dframe[dframe[trade_form] != 'Undeclared Countries']
dframe[target_value].replace('',0, inplace=True)
#dframe = dframe[dframe[target_value] != 0]
dframe = dframe[dframe[target_value] >= 0.01]
return dframe
xdf = load_data('sitc4digit_year.csv','exporter','rca','sitc_product_code',7132,'year')
xdf.columns
xdf.describe()
exporters = xdf['exporter'].unique()
time = xdf['year'].unique()
print('Shape of the dataframe: {}'.format(xdf.shape)) # Full dataset is 8012 entries
print('Data ranges from {} to {}'.format(xdf.year.min(),xdf.year.max()))
print('The rca values range from {} to {}'.format(xdf.rca.min(),xdf.rca.max()))
print('Countries not declared are {} out of {}'.format(xdf['exporter'].loc[xdf['exporter']=='Undeclared Countries'].count(),xdf.exporter.count()))
print('Zero export values: {}'.format(xdf['rca'][xdf['rca']==0].count()))
print('Null export values: {}'.format(xdf['rca'].isnull().sum()))
Converting to datetime format:
xdf['year'] = pd.to_datetime(xdf['year'], format='%Y')
xdf.columns
#xdf = xdf.drop(columns=["rca"])
#xdf.columns
#sns.scatterplot(x='year',y='rca',data=xdf)
#sns.pairplot(xdf)
fig, ax = plt.subplots(figsize=(30,15))
for key, grp in xdf.groupby(['exporter']):
ax = grp.plot(ax=ax, kind='line', x='year', y='rca', label=key, marker='o',title='Exports for {}'.format(product))
ax.legend(ncol=8, loc='best')
target = 'images\\{}\\{}'.format(product,experiment)
savefile = os.path.join(os.getcwd(),target,'Exports for {}'.format(product))
plt.savefig(savefile)
df_grp = xdf.set_index('year')
df_grp.head(2)
df_grp = xdf.groupby([xdf.year.name, xdf.exporter.name]).mean().unstack()
df_grps = df_grp.apply(pd.to_numeric, errors='coerce').fillna(0, downcast='infer')
df_grps.columns = df_grps.columns.droplevel()
df_grps.head(1)
Setting index as the datetime column for easier manipulations:
Image storage
location = os.getcwd()
target = 'images\\{}\\{}'.format(product,experiment)
path = os.path.join(location, target)
#df_pivoted = xdf.pivot_table(index='year',columns='exporter',values='export_value',aggfunc='sum')
fig1, ax1 = plt.subplots(figsize=(30,15))
#https://matplotlib.org/3.1.0/gallery/color/named_colors.html
colors = ["gray","brown","darkviolet","bisque","b","g","r","c","gold","darkolivegreen","m","teal","rosybrown","y","orange","k","fuchsia","lavender","maroon","lime",
"olive","salmon","peru","aqua"]
label='global trends in export rca values for engine parts'
df_grps.plot(ax=ax1,label=label,color=colors)
ax1.set_ylabel('Export rca values)')
fig1.suptitle('Global exports for {}'.format(product))
plt.legend(ncol=8,loc='best')
root = os.path.join(path, 'export rca trends')
plt.savefig(str(root))
#.plot(legend=False)#subplots=True, legend=False)
# Adapted from https://stackoverflow.com/questions/30942755/plotting-multiple-time-series-after-a-groupby-in-pandas
# Modified from https://www.programcreek.com/python/example/98021/matplotlib.dates.YearLocator
from matplotlib.dates import YearLocator, MonthLocator, DateFormatter
def plot_multiple_time_series(dframe, ts_label, grp_label, values_label, product,experiment=experiment,figsize=(30,15), title=None):
'''
Plots multiple time series on one chart by first grouping time series based on series names from column in dataframe
Parameters:
----------
dframe : timeseries Pandas dataframe
ts_label : string
The name of the df column that has the datetime timestamp x-axis values.
grp_label : string
The column name in dframe for groupby method.
values_label : string
The column name in dframe for the y-axis.
figsize : tuple of two integers
Figure size of the resulting plot, e.g. (20, 7)
title : string
Optional title
'''
xtick_locator = YearLocator()
xtick_dateformatter = DateFormatter('%Y')
fig, ax = plt.subplots(figsize=figsize)
location = os.getcwd()
target = 'images\\{}\\{}'.format(product,experiment)
path = os.path.join(location, target)
for key, grp in dframe.groupby([grp_label]):
ax = grp.plot(ax=ax, kind='line', x=ts_label, y=values_label, label=key, marker='o')
ax.xaxis.set_major_locator(xtick_locator)
ax.xaxis.set_major_formatter(xtick_dateformatter)
ax.autoscale_view()
ax.legend(ncol=5, loc='upper left')
#_ = plt.xticks(rotation=0, )
_ = plt.grid()
_ = plt.xlabel('year')
_ = plt.ylabel('Export rca value')
_ = plt.ylim(0, dframe[values_label].max() * 1.25)
if title is not None:
_ = plt.title(title)
savefile = os.path.join(path,label)
plt.savefig(str(savefile))
_ = plt.show()
plot_multiple_time_series(xdf, 'year', 'exporter', 'rca', 'engine_parts',experiment,title='Global export_rca trends for engine parts')
Reviewing plots of the density of observations can provide further insight into the structure of the data:
for exporter in exporters:
root = save_images(df_grps,'rca_histograms')
plt.figure(1);
plt.subplot(111)
plt.title('Histogram for {}'.format(exporter))
#sns.distplot(df_grps[exporter].values().index.year)
plt.hist(df_grps[exporter])
plt.ylabel('rca value')
plt.xlim(1962,2016)
plt.ylim(0,120)
plt.subplot(112)
plt.title('Density plot for {}'.format(exporter))
df_grps[exporter].plot(kind='kde')
plt.ylabel('Density')
display(plt.savefig(str(root)))
plt.show()
df_grps.info()
#plt.subplots(figsize=(15,6))
sns.distplot(xdf)
#savefile = os.path.join(path,'global trend')
#plt.savefig(savefile)
plt.show()
countries = df_grps.columns
Target folder to store individual images
def save_images(ts_groups,analysis):
'''
args: can be from these options ['decompose',acf_pacf','adf_test','rolling_stats']
'''
path = os.getcwd()+'\\images\\{}\\{}\\{}'.format(product,experiment,analysis)
savefile = os.path.join(path, ts_groups[country].name)
return savefile
Decomposing using statsmodel:
Decomposition
for country in countries:
analysis = 'decomposition'
savefile = save_images(df_grps,analysis)
decomposition = sm.tsa.seasonal_decompose(df_grps[country], model='additive')
rcParams['figure.figsize'] = 18, 8
fig = decomposition.plot()
plt.title('{}_Time series decomposition'.format(df_grps[country].name))
plt.savefig(str('{}_{}'.format(savefile,analysis)))
plt.show()
Notes from Alkaline ML
In order to quantitatively determine whether we need to difference our data in order to make it stationary, we can conduct a test of stationarity
We can check stationarity using the following:
Augmented Dickey-Fuller Test [*]:
If the ‘Test Statistic’ is less than the ‘Critical Value’, we can reject the null hypothesis and say that the series is stationary. Based on papers by:
.. [*] W. Green. "Econometric Analysis," 5th ed., Pearson, 2003.
.. [*] Hamilton, J.D. "Time Series Analysis". Princeton, 1994.
.. [*] MacKinnon, J.G. 1994. "Approximate asymptotic distribution functions for
unit-root and cointegration tests. Journal of Business and Economic Statistics 12, 167-76.
.. [*] MacKinnon, J.G. 2010. "Critical Values for Cointegration Tests." Queen's University, Dept of Economics, Working Papers. Available at http://ideas.repec.org/p/qed/wpaper/1227.html
for country in countries:
savefile = save_images(df_grps,'acf_pacf')
plt.figure(figsize=(15,6))
plt.subplot(121)
plot_acf(df_grps[country], ax=pyplot.gca(),title="{} - Autocorrelation".format(df_grps[country].name))
plt.subplot(122);
plot_pacf(df_grps[country], ax=pyplot.gca(),title="{} - Partial Autocorrelation".format(df_grps[country].name))
plt.savefig(str('{}_ACF_PACF'.format(savefile)))
plt.show()
| ACF Shape | Indicated Model | |
|---|---|---|
| Exponential, decaying to zero | Autoregressive model. Use the partial autocorrelation plot to identify the order of the autoregressive model | |
| Alternating positive and negative, decaying to zero Autoregressive model. | Use the partial autocorrelation plot to help identify the order. | |
| One or more spikes, rest are essentially zero | Moving average model, order identified by where plot becomes zero. | |
| Decay, starting after a few lags | Mixed autoregressive and moving average (ARMA) model. | |
| All zero or close to zero | Data are essentially random. | |
| High values at fixed intervals | Include seasonal autoregressive term. | |
| No decay to zero | Series is not stationary |
cycles = int((len(df_grps))/5)
for country in countries:
savefile = save_images(df_grps,'rolling_stats')
plt.figure(figsize=(15,6))
rolmean = df_grps[country].rolling(2).mean()
rolstd = df_grps[country].rolling(cycles).std()
#Plot rolling statistics:
orig = plt.plot(df_grps[country], color='blue',label='Original')
mean = plt.plot(rolmean, color='red', label='Rolling Mean')
std = plt.plot(rolstd, color='black', label = 'Rolling Std')
plt.legend(loc='best')
plt.title('{}: Rolling Mean & Standard Deviation'.format(df_grps[country].name))
plt.savefig(str('{}_Rolling_Stats'.format(savefile)))
plt.show(block=False)
Null hypothesis (Ho): The time series is not stationary, it presents heterocedasticity. In other words, the time series depends on itself (i.e.: yt depends on yt-1, yt-1 depends on yt-2
Alternative hypothesis(Ha): the series is stationary
from statsmodels.tsa.stattools import adfuller, kpss
def ADF_Stationarity_Test(ts_groups):
#countries = list(ts_groups.columns)
adf_results = {}
col_names=['Test Statistic','p_value','Lags used','Observations Used','1% Critical value','5% Critical value','10% Critical value','IC_Best']
data = pd.DataFrame(columns=col_names,index=countries)
for country in countries:
#Dickey-Fuller test
adf_results[country] = tsa.adfuller(ts_groups[country],autolag='AIC')
for country in adf_results:
data.loc[country] = pd.Series({'Test Statistic':adf_results[country][0],
'p_value':adf_results[country][1],
'Lags used':adf_results[country][2],
'Observations Used':adf_results[country][3],
'1% Critical value': adf_results[country][4]['1%'],
'5% Critical value': adf_results[country][4]['5%'],
'10% Critical value': adf_results[country][4]['10%'],
'IC_Best': adf_results[country][5]
})
return data.transpose()
adf_test = ADF_Stationarity_Test(df_grps)
adf_countries = adf_test.columns
adf_test['South Africa'].name
for country in adf_test:
savefile = save_images(adf_test,'adf_test')
adf_test[country].plot.bar()
plt.legend(loc='best')
plt.title('{} - ADF Results'.format(adf_test[country].name))
plt.savefig(str('{}_ADF.png'.format(savefile)))
plt.show()
#pyplot.figure(figsize=(15,6))
#Plot rolling statistics:
#adf_plot = plt.plot(adf_test[country])
#plt.legend(loc='best')
#plt.title('{}: ADF'.format(adf_test[country].name))
#pyplot.savefig(str('{}adf_test.png'.format(savefile)))
#pyplot.show(block=False)
There are 2 major reasons behind non-stationaruty of a TS:
ts_log = np.log(df_grps)
plt.plot(ts_log)
plt.show()
ts_log
moving_avg = ts_log.rolling(5).mean()
plt.plot(ts_log)
plt.plot(moving_avg)
plt.show()
ma = []
for col in ts_log:
ts_log_moving_avg_diff = ts_log[col] - moving_avg[col]
#ma.append(ts_log_moving_avg_diff)
ts_log_moving_avg_diff.head(10)
ts_log_moving_avg_diff.dropna(inplace=True)
test_stationarity(ts_log_moving_avg_diff)
expwighted_avg = ts_log.ewm(com=0.5).mean()
#df.ewm(com=0.5).mean()
plt.plot(ts_log)
plt.plot(expwighted_avg)
plt.show()
ts_log_ewma_diff = ts_log- expwighted_avg
ts_log_ewma_diff
ts_log_diff = ts_log - ts_log.shift()
plt.plot(ts_log_diff)
plt.show()
ts_log_diff.dropna(inplace=True)
test_stationarity(ts_log_diff)
from statsmodels.tsa.seasonal import seasonal_decompose
decomposition = seasonal_decompose(ts_log)
trend = decomposition.trend
seasonal = decomposition.seasonal
residual = decomposition.resid
plt.subplot(411)
plt.plot(ts_log, label='Original')
plt.legend(loc='best')
plt.subplot(412)
plt.plot(trend, label='Trend')
plt.legend(loc='best')
plt.subplot(413)
plt.plot(seasonal,label='Seasonality')
plt.legend(loc='best')
plt.subplot(414)
plt.plot(residual, label='Residuals')
plt.legend(loc='best')
plt.tight_layout()
ts_log_decompose = residual
ts_log_decompose.dropna(inplace=True)
test_stationarity(ts_log_decompose)